#load required packages
source("modified_herachp.R")
library(ggplot2)

#load required data
load("years.RData")
load("names.RData")

#get list of years in dataset
t <- c(min(unique(unlist(years))):max(unique(unlist(years))))

#construct frequency matrix
freq_matrix <- matrix(data = 0, nrow = length(t), ncol = length(years))
i <- 1
while(i <= length(t)){
  temp <- grep(t[i], years)
  if(length(temp) > 0){
    j <- 1
    while(j <= length(temp)){
      freq_matrix[i, temp[j]] <- freq_matrix[i, temp[j]] + length(grep(t[i], years[[temp[j]]]))
      j <- j + 1
    }
  }
  i <- i + 1
}

#get population size over time
pop_size <- list()
i <- 1
while(i <= length(t)){
  temp_pop_size <- c()
  temp <- grep(t[i], years)
  j <- 1
  while(j <= length(temp)){
    temp_pop_size <- c(temp_pop_size, unique(unlist(names[[temp[j]]][grep(t[i], years[[temp[j]]])])))
    j <- j + 1
  }
  pop_size[[i]] <- length(unique(temp_pop_size))
  i <- i + 1
}

pop_size <- data.frame(t, unlist(pop_size))
colnames(pop_size) <- c("year", "pop_size")

#get total number of unique artists for analysis years (1987-2018)...
i <- 1
artists <- c()
while(i <= length(years)){
  artists <- c(artists, unlist(names[[i]][which(unlist(years[i]) %in% 1987:2018)]))
  i <- i + 1
}
length(unique(artists))

#subset matrix to only include years with more than 100 variants...
num_var <- c()
i <- 1
while(i <= nrow(freq_matrix)){
  num_var <- c(num_var, length(which(freq_matrix[i, ] > 0)))
  i <- i + 1
}
freq_matrix <- freq_matrix[29:60, ]

#remove variants that are only sampled in excluded years
freq_matrix <- freq_matrix[, -which(as.numeric(colSums(freq_matrix)) == 0)]

#add colnames to allow for turnover calculation
colnames(freq_matrix) <- c(1:ncol(freq_matrix))

#get mean population size across the used years
n <- round(mean(pop_size$pop_size[29:60]))

#calculate mu
i <- 1
num_inn_time <- c(rep(0, nrow(freq_matrix)))
while(i <= ncol(freq_matrix)){
  num_inn_time[[which(freq_matrix[, i] > 0)[1]]] <- num_inn_time[[which(freq_matrix[, i] > 0)[1]]] + 1 #for each variant, add 1 to the year in which it first appears
  i <- i + 1
}
i <- 1
mu_estimation <- c(rep(0, nrow(freq_matrix)))
while(i <= nrow(freq_matrix)){
  mu_estimation[i] <- num_inn_time[i]/sum(freq_matrix[i,]) #first (recommended) method recommended by Shennan and Wilkinson (2001)... number of new types per total number of variants in a timepoint
  i <- i + 1
}
mu <- mean(mu_estimation[-1])

#turnover, expected diversity, and b (drawn from transmission function in HERAChp.KandlerCrema)
top = min(apply(freq_matrix, 1, function(x) {
  sum(x > 0)
}))
zMatrix <- turnover(freq_matrix, top)
z = apply(zMatrix, 2, mean)
z.frame = data.frame(y = 1:top, obs.z = z)
aN = 1.38 * (mu^0.55) * n^0.13
z.frame$exp.z.Bentley = z.frame$y * sqrt(mu)
z.frame$exp.z.EvansGiometto = aN * (z.frame$y)^0.86/2
z.frame$zfitN = z.frame$obs.z/aN
modN = lm(log(zfitN + 1e-06) ~ log(y), data = z.frame)
b = as.numeric(coefficients(modN)[2])

#run Kolmogorov-Smirnov tests to see if distributsions are significantly different
ks.test(z.frame$obs.z, z.frame$exp.z.Bentley)
ks.test(z.frame$obs.z, z.frame$exp.z.EvansGiometto)

#observed diversity (drawn from transmission function in HERAChp.KandlerCrema)
obs.div <- 1 - apply(freq_matrix, 1, function(x) {
  sum(c(x/sum(x))^2)
})

#generate summary statistics for parameter estimation and model choice
sum_stats <- c(b, mean(obs.div), obs.div, colMeans(zMatrix))
